Sys.setenv(LANG = "en")
library(pROC)
Type 'citation("pROC")' for a citation.
Attaching package: ‘pROC’
The following objects are masked from ‘package:stats’:
cov, smooth, var
setwd("C:/Users/halan/Desktop/FPU/SPring2025/StatisticalLearning")
incidents <- read.csv("mass_killing_incidents_public.csv")
offenders <- read.csv("mass_killing_offenders_public.csv")
victims <- read.csv("mass_killing_victims_public.csv")
weapons <- read.csv("mass_killing_weapons_public.csv")
view(incidents)
view(offenders)
view(victims)
view(weapons)
DATA: Mass Killing Database 2006 - Present (AS OF MARCH 15th
2025)
Mass Murder for this dataset is defined as “…the intentional killing
of four or more victims by any means within a 24-hour period, excluding
the death of unborn children and the offender(s).)
This Database is constantly being updated (Was just updated 2 hours
ago as of March 16th, 2025 at 12:50PM) and I would say is very
trustworthy. The contact details for this Database is Justin Myers.
Justin Myers is a veteran journalist for “The Associated Press”, they
are also working with Northeastern University’s James Alan Fox whom is
also a contact.
Features(Incident [19 Columns x 615 Rows]):
Red = Categorical | Green = Numerical
- incident_id: ID of incident
- date: Date of incident
- city: City/Town name
- state: State postal code
- num_offenders: Number of
offenders
- num_killed: Number of victims
killed
- num_injured: Numnber of victims
injured
- firstcod: First cause of death
- secondcod: Second cause of death
- if_assault_rifle: Whether a rifle was
used
- type: Type of incident (ie. Family,
Felony, Public, etc.)
- situation_type: Type of situation
(ie. Arson, Drug Trade, Family Issue, etc.)
- location_type: Type of location (ie.
Commercial/Retail/Entertainment, Government/Transit, House of worship,
etc.)
- location: Location (ie.
Bar/Club/Restaurant, College, Commercial/Retail, etc.)
- longitude
- latitude
- county: County associated with
coordinates
- geocode: FIPS geocode of
location
- narrative: Description of incident
Features(Victims [6 Columns x 3174 Rows]):
- incident_id: ID for incident, used to
link the victims data.
- victim_id: ID of the victim, for
recording purposes.
- age: Age of the victim.
- race
- sex
- vorelationship: Relationship of
victim to the offender
Features(Weapons [5 Columns x 963 Rows]):
- incident_id: ID for incident, used to
link weapon to incident.
- weapon_id: ID for weapon
identification.
- weapon_type: Type of weapon.
- gun_class: Classification of gun
(Handgun, Long gun, Unknown gun class, Non-gun)
- gun_type: Type of gun (ie. Handgun,
Pistol, Revolver, Rifle, etc.)
Features(Offenders [15 Columns x 788 Rows]):
- incident_id: ID of incident, used to
connect the 4 data sets.
- offender_id: ID used for
classification of offenders (Mostly for multi-offender incidents)
- firstname
- middlename
- lastname
- suffix
- age
- race
- sex
- suicide: Whether the offender
commited suicide or not.
- deathcause: What did the offender die
by whether it was suicide or killed by a bystander/police
- outcome: What was the legal outcome
for the incident (ie. Acquitted, Arrested/Pending trial, Charges
dropped, etc.)
- criminal_justice_process: Status of
criminal justice process (ie. Arrested/Pending trial, charges dropped,
etc.)
- sentence_type: Type of sentence
convicted (ie. Acquitted, Awaiting sentencing, Committed, Death sentence
etc.)
- sentence_details: Details about the sentence convicted.
glimpse(incidents)
Rows: 615
Columns: 19
$ incident_id <int> 644, 643, 642, 639, 641, 638, 637, 636, 635, 634, 632, 631, 629, 630, 627, 626, 625, 624, 623, 620, 619, 618, 616, 615, 614, 613…
$ date <chr> "2025-02-21", "2025-02-10", "2025-01-01", "2024-12-14", "2024-12-14", "2024-12-07", "2024-11-16", "2024-11-10", "2024-11-07", "2…
$ city <chr> "Lake Station", "Byron", "New Orleans", "Belen", "West Valley", "DeKalb County", "Lancaster", "Wichita", "Duluth", "Fall City", …
$ state <chr> "IN", "WY", "LA", "NM", "UT", "GA", "CA", "KS", "MN", "WA", "AL", "GA", "IL", "NY", "NY", "GA", "FL", "NY", "AL", "AL", "CA", "K…
$ num_offenders <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 2, 1, 5, 1, 1, 1, 3, 1, 1,…
$ num_victims_killed <int> 4, 4, 14, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 5, 4, 5, 4, 5, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 5, 6, 4, 8, 4, 4, 6, 4…
$ num_victims_injured <int> 0, 0, 30, 0, 1, 0, 0, 0, 0, 1, 17, 9, 0, 0, 0, 0, 0, 0, 0, 10, 0, 3, 1, 9, 0, 0, 4, 0, 7, 0, 0, 7, 0, 0, 1, 2, 0, 0, 1, 0, 0, 3,…
$ firstcod <chr> "Shooting", "Shooting", "Vehicle crash", "Shooting", "Shooting", "Shooting", "Shooting", "Shooting", "Shooting", "Shooting", "Sh…
$ secondcod <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "Smoke inhalation & burns", "", "", "", "Stabbing", "", "", "", "", "", "", …
$ if_assault_rifle_used <chr> "Unknown gun type", "Gun not an assault rifle", "Nongun", "Gun not an assault rifle", "Unknown gun type", "Unknown gun type", "G…
$ type <chr> "Family", "Family", "Public", "Family", "Family", "Family", "Undetermined", "Family", "Family", "Family", "Suspected felony", "P…
$ situation_type <chr> "Family issue", "Despondency", "Terrorism", "", "", "Family issue", "", "", "", "Family issue", "Other", "Indiscriminate", "Unde…
$ location_type <chr> "Residence/Other shelter", "Residence/Other shelter", "Open space", "Residence/Other shelter", "Residence/Other shelter", "Resid…
$ location <chr> "Residence", "Residence", "Open space", "Residence", "Residence", "Residence", "Residence", "Residence", "Residence", "Residence…
$ longitude <dbl> -87.25547, -108.50082, -90.06991, -106.68182, -111.97256, -84.31904, -118.11650, -97.32347, -92.15961, -121.88448, -86.79633, -8…
$ latitude <dbl> 41.59381, 44.79471, 29.95388, 34.56935, 40.69044, 33.72203, 34.69170, 37.65839, 46.74984, 47.53114, 33.50061, 33.94773, 41.87422…
$ county <chr> "Lake County", "Big Horn County", "Orleans Parish", "Valencia County", "Salt Lake County", "DeKalb County", "Los Angeles County"…
$ census_tract_geoid <dbl> 18089041700, 56003962800, 22071013400, 35061971100, 49035113309, 13089023702, 6037900607, 20173003900, 27137015800, 53033032605,…
$ narrative <chr> "While conducting a welfare check at a Lake Station, IN, home, the police discovered the bodies of a man, a woman, a three child…
Filling in blank information for clarity.
unique(incidents$situation_type)
[1] "Family issue" "Despondency" "Terrorism" "" "Other" "Indiscriminate"
[7] "Undetermined" "Robbery" "Arson" "Interpersonal conflict" "Gang conflict" "Drug trade"
[13] "Employment grievance" "Hate" "Profit"
incidents$situation_type[incidents$situation_type == ""] <- "Unknown"
unique(incidents$situation_type)
[1] "Family issue" "Despondency" "Terrorism" "Unknown" "Other" "Indiscriminate"
[7] "Undetermined" "Robbery" "Arson" "Interpersonal conflict" "Gang conflict" "Drug trade"
[13] "Employment grievance" "Hate" "Profit"
Here is a quick diagram of the US (Excluding Alaska and Hawaii as
it’s not properly fitted) that showcases where all the incidents took
place. The plan is for me to eventually also turn this into a heat map
to give a better gauge as when there are multiple like this it is
difficult to showcase how much there is due to overlap. I am also
attempting two different ways of showing the data points. Eventually
I’ll figure out which would be the most effective method.
varI <- incidents %>%
select(c(incident_id, longitude, latitude, firstcod, situation_type, location, num_victims_killed, num_victims_injured)) %>%
rename(lon = longitude) %>%
rename(lat = latitude)
varI
transformedData <- usmap_transform(varI)
tempData <- transformedData %>%
filter(row_number() != 262)

# Excluding the 2017 Las Vegas Nevada shooter incident as it gets in the way of clarity.
tempData <- transformedData %>%
filter(row_number() != 262)
plot_usmap() + geom_sf(
data = tempData,
aes(color = firstcod, size = num_victims_injured),
alpha = .5) +
theme(legend.position = "right", legend.key.size = unit(1, 'mm'), plot.title=element_text(hjust=0.5, face = "bold")) +
labs(title = "PRIMARY reason for injured victims", caption = "-The 2017 Las Vegas Shooter has been excluded for clarity. That event had 60 deaths and 867 injured.")





# Getting a count of the incidents per state
stateIncidents <- incidents %>%
count(state)
# Setting a fill color variable dependent on the number of incidents
stateIncidents <- stateIncidents %>%
mutate(fillColor = ifelse(n>=40, "40+", ifelse(n>=20, "20-39", "0-19")))
Here is a bar graph statistic showcasing all of the incidents from
the data set and showing them from most to least in a bar graph form.
From this we can see that California, Texas and Illinois are the three
most dangerous (in the sense of the most mass murders occuring)
locations in the US. Florida being an honorable mention for fourth
place. From here on I could also then further test to check the amount
of victims, whilst this graph shows the most mass killings in occurrence
for the past ~20 years, that does not necessarily pertain to deadliness
as there may be other states that have less but more severe mass
murders.

# Getting a sum of the amount of both victims killed and injured per state
deadlynessPerState <- incidents %>%
group_by(state) %>%
summarize(totalDeaths = sum(num_victims_killed), totalInjured = sum(num_victims_injured))
# Now applying a manual fill variable to be able to apply to the graph later
deadlynessPerState <- deadlynessPerState %>%
mutate(fillColorDeath = ifelse(totalDeaths>=200, "200+", ifelse(totalDeaths>=100, "100-199", "0-99")),
fillColorInjury = ifelse(totalInjured>=200, "200+", ifelse(totalDeaths>=100, "100-199", "0-99")))


ggplot(weapons, aes(x = weapon_type)) +
geom_bar(width = 0.8)

With this Violin plot (+ Boxplot) we can see the general distribution
of the victims as well as the medians. Generally it seems that females
and males are both on average not too dissimilar but this is without
knowing the exact amount of numbers. Proportionally however, the age of
which they died does not seem to matter despite the sex. The unknown
seems to be cases which the bodies were recovered but maybe not able to
be identified as these cases have little to no information to them

I specifically remember this occurrence, I recall it being everywhere
on the news and so I tested out the ID and seeing how well I could
extract the data.
incidentsFL <- incidents %>%
filter(state == "FL")
incidentsFL
weapons %>%
filter(incident_id == 300)
victims %>%
filter(incident_id == 300)
offenders %>%
filter(incident_id == 300)
Here I was checking if there are any other discriminatory attacks
against gay people searching through the narratives for the term “gay”,
surprisingly, out of all the ones in the US only one showed up, which is
the same one from above that occurred in Orlando.
# With grepl I can search for specific words in the narrative or any other of the strings of texts that give narratives.
incidents %>%
filter(grepl("gay", narrative))
NA
I would consider this data set to be relatively clean. But,
there are some things that could be altered for the sake of working with
them. For example, there are a lot of NULL cells but due to the type of
data I do not plan to remove these rows, I will however probably change
them into “N/A” or something of the sort as they are important. There
are some features, for example - situation_type, which has
“Undetermines” and just flat out Null. Reading through the narratives,
my personal belief is that these are most likely incidents in which
there was simply no official statement of what type of situation the
incident was. This would be a situation in which I would alter the data
to officially state something like “Not Stated”
Discussion:
My general impression on this data set is that while it is very
interested, at least for me, it does pose the challenge of attempting to
do any sort of predictions. A lot of this data is simply fact but
because of the nature of the data I cannot seem to do any sort of time
series data as the dates could be considered as continuous but the
situations are separate and not linked together.
Regarding the type of models, I’m thinking of maybe using a type of
linear regression model to see if maybe there is any correlation with
certain features to determine/predetermine the potential lethality of a
situation based on the information it has given us. This would be tricky
as things like race and age are too broad and other features like
situation_type are generally hard to gague before the situation has
already escalated. That said I’ll attempt to at least see what I would
be able to predict like maybe what the odds are of survival with
specific body types (age, sex, race, etc.) have in their respective
situations. This could be done with the usage of linear regression.
fullMergedClean <- subset(fullMerged, select = -c(incident_id, location_type, longitude, latitude, census_tract_geoid, narrative, firstname, lastname, middlename, suffix, deathcause, outcome, criminal_justice_process, sentence_type, sentence_details, victim_id, weapon_id, ))
Error in c(incident_id, location_type, longitude, latitude, census_tract_geoid, :
argument 18 is empty
fullMergedClean$had_survivor <- ifelse(fullMergedClean$num_victims_injured > 0, "Yes", "No")
fullMergedClean$had_survivor <- as.factor(fullMergedClean$had_survivor)
non_numeric_cols
[1] "weekday" "state" "firstcod" "secondcod" "if_assault_rifle_used" "situation_type"
[7] "location" "race.x" "sex.x" "race.y" "sex.y" "vorelationship"
[13] "weapon_type" "gun_class" "gun_type" "had_survivor"
incidents
offenders
victims
weapons
FEATURE ENGINEER, GOING TO MERGE EVERYTHING Features wanted
incident_id, date, state, num_offenders, num_victims_killed,
num_victims_injured, firstcod, if_assault_rifle_used, situation_type,
location, offender_age, offender_race, offender_sex, victim_age,
victim_sex, vorelationship, weapon_type
ADD num_victims_total
Turn Categorical into dummy variables
# Fixing the date and adding extra date columns.
incidentsUpdated <- incidents %>%
mutate(
date = as.Date(date),
year = year(date),
month = month(date),
day = day(date),
weekday = weekdays(date)
)
incidentsUpdated <- na.omit(incidentsUpdated)
incidentsUpdated$had_survivor <- ifelse(incidentsUpdated$num_victims_injured > 0, "Yes", "No")
incidentsUpdated$had_survivor <- as.factor(incidentsUpdated$had_survivor)
# [Name] Summaries, these will get all the columns that will be used for the models.
incidentsSum <- incidentsUpdated %>%
subset(select = c(incident_id, date, year, month, day, weekday, state, num_offenders, num_victims_killed, num_victims_injured, firstcod, situation_type, location, had_survivor))
offenderSum <- offenders %>%
subset(select = c(incident_id, offender_id, age, race, sex)) %>%
rename(offenderAge = age) %>%
rename(offenderRace = race) %>%
rename(offenderSex = sex)
victimSum <- victims %>%
subset(select = c(incident_id, victim_id, age, race, sex, vorelationship)) %>%
rename(victimAge = age) %>%
rename(victimRace = race) %>%
rename(victimSex = sex)
weaponSum <- weapons %>%
subset(select = c(incident_id, weapon_id, weapon_type))
# Merging the data
totalData <- incidentsSum %>%
left_join(offenderSum, by = "incident_id") %>%
left_join(victimSum, by = "incident_id") %>%
left_join(weaponSum, by = "incident_id")
Warning: Detected an unexpected many-to-many relationship between `x` and `y`.Warning: Detected an unexpected many-to-many relationship between `x` and `y`.
totalData <- na.omit(totalData)
totalData
charEncode <- c("weekday", "state", "firstcod", "situation_type", "location", "offenderRace", "offenderSex", "victimRace", "victimSex", "vorelationship", "weapon_type")
# Turns any blank cells to "Unknown"
totalData[totalData == ""] <- "Unknown"
#any(is.na(totalData))
# This will factorize all of the data instead of having to manually do it
totalDataFactored <- totalData %>%
mutate(across(all_of(charEncode), ~ as.integer(factor(.x))))
# Sums the max value in all of the columns which should, in theory, give the total amount of distinct columns I should have.
sum(sapply(totalDataFactored[charEncode], max))
[1] 152
# Get all of the names and which column they were from
factorLookup <- totalData %>%
dplyr::select(charEncode) %>%
pivot_longer(cols = everything(),
names_to = "Origin",
values_to = "Name")
# Gathers all of the factored data
factorLookup2 <- totalDataFactored %>%
dplyr::select(charEncode) %>%
pivot_longer(cols = everything(),
names_to = NULL,
values_to = "Encoded Value")
# Assuming nothing has changed and having checked the number it matches up. I assume this is correct and now factorLookup is a good dictionary to go back to.
factorLookup <- cbind(factorLookup, factorLookup2)
factorLookup <- distinct(factorLookup)
factorLookup
splitData <- createDataPartition(binaryFactored$had_survivor, p = 0.8, list = FALSE)
trainData <- binaryFactored[splitData, ]
testData <- binaryFactored[-splitData, ]
# This is so that I can use it in the corr matrix but also make my life easier
totalDataFactoredForReal <- subset(totalDataFactored, select = -c(incident_id, date, year, month, offender_id, victim_id))
totalDataFactoredForReal$had_survivor <- as.numeric(totalDataFactoredForReal$had_survivor)
# Linear Regression Model
linModel <- lm(had_survivor ~ ., data = totalDataFactoredForReal)
summary(logModel)
Call:
lm(formula = had_survivor ~ ., data = totalDataFactoredForReal)
Residuals:
Min 1Q Median 3Q Max
-0.97413 -0.34860 -0.00111 0.30482 0.84998
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.705e+00 5.937e-02 28.724 < 2e-16 ***
day -4.216e-03 5.544e-04 -7.604 3.19e-14 ***
weekday 1.154e-02 2.514e-03 4.592 4.46e-06 ***
state -4.960e-04 3.283e-04 -1.511 0.13087
num_offenders -8.142e-03 3.775e-03 -2.157 0.03105 *
num_victims_killed 1.323e-02 7.730e-04 17.118 < 2e-16 ***
num_victims_injured -2.157e-04 5.540e-05 -3.893 9.97e-05 ***
firstcod 2.365e-02 5.646e-03 4.188 2.84e-05 ***
situation_type 1.169e-03 1.360e-03 0.859 0.39018
location -4.773e-02 1.499e-03 -31.844 < 2e-16 ***
offenderAge -2.790e-03 4.852e-04 -5.749 9.30e-09 ***
offenderRace 1.070e-03 2.895e-03 0.370 0.71176
offenderSex 1.233e-01 1.678e-02 7.350 2.17e-13 ***
victimAge 1.138e-03 2.484e-04 4.581 4.70e-06 ***
victimRace -2.621e-03 2.783e-03 -0.942 0.34631
victimSex -1.343e-02 8.880e-03 -1.513 0.13038
vorelationship -1.126e-04 5.381e-04 -0.209 0.83425
weapon_id -5.213e-05 1.702e-05 -3.062 0.00221 **
weapon_type -1.947e-02 4.101e-03 -4.748 2.09e-06 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.3851 on 8108 degrees of freedom
Multiple R-squared: 0.3731, Adjusted R-squared: 0.3718
F-statistic: 268.1 on 18 and 8108 DF, p-value: < 2.2e-16
corM <- cor(totalDataFactoredForReal)
corrplot(corM, method = "color")

heatmap(corM, symm = TRUE, main = "Correlation Heatmap")

library(randomForest)
Warning: package ‘randomForest’ was built under R version 4.4.3randomForest 4.7-1.2
Type rfNews() to see new features/changes/bug fixes.
Attaching package: ‘randomForest’
The following object is masked from ‘package:dplyr’:
combine
The following object is masked from ‘package:ggplot2’:
margin
logModel <- glm(had_survivor~ . -num_victims_killed -num_victims_injured -had_survivor, data = trainData, family = "binomial")
stepLogModel <- step(logModel, direction = "both")
Start: AIC=6468.25
had_survivor ~ (day + weekday + state + num_offenders + num_victims_killed +
num_victims_injured + firstcod + situation_type + location +
offenderAge + offenderRace + offenderSex + victimAge + victimRace +
victimSex + vorelationship + weapon_id + weapon_type) - num_victims_killed -
num_victims_injured - had_survivor
Df Deviance AIC
- victimAge 1 6434.3 6466.3
<none> 6434.2 6468.2
- firstcod 1 6436.6 6468.6
- victimRace 1 6436.7 6468.7
- weekday 1 6437.3 6469.3
- state 1 6437.4 6469.4
- victimSex 1 6439.0 6471.0
- situation_type 1 6439.1 6471.1
- num_offenders 1 6442.7 6474.7
- weapon_id 1 6446.1 6478.1
- weapon_type 1 6450.8 6482.8
- offenderRace 1 6452.2 6484.2
- vorelationship 1 6463.9 6495.9
- offenderSex 1 6487.0 6519.0
- offenderAge 1 6489.8 6521.8
- day 1 6718.2 6750.2
- location 1 7431.4 7463.4
Step: AIC=6466.3
had_survivor ~ day + weekday + state + num_offenders + firstcod +
situation_type + location + offenderAge + offenderRace +
offenderSex + victimRace + victimSex + vorelationship + weapon_id +
weapon_type
Df Deviance AIC
<none> 6434.3 6466.3
- firstcod 1 6436.7 6466.7
- victimRace 1 6436.9 6466.9
- weekday 1 6437.3 6467.3
- state 1 6437.4 6467.4
+ victimAge 1 6434.2 6468.2
- victimSex 1 6439.0 6469.0
- situation_type 1 6439.2 6469.2
- num_offenders 1 6442.7 6472.7
- weapon_id 1 6446.4 6476.4
- weapon_type 1 6450.8 6480.8
- offenderRace 1 6452.4 6482.4
- vorelationship 1 6464.8 6494.8
- offenderSex 1 6487.2 6517.2
- offenderAge 1 6490.5 6520.5
- day 1 6719.1 6749.1
- location 1 7489.2 7519.2
stepLogModelCV <- train(had_survivor ~ . -num_victims_killed -num_victims_injured -had_survivor, data = trainData, method = "glm", family = "binomial", trControl = train_control, metric = "ROC")
stepLogPred <- predict(stepLogModelCV, newdata = testData, type = "prob")
rocOb <- roc(testData$had_survivor, stepLogPred$Yes)
Setting levels: control = Yes, case = No
Setting direction: controls > cases
plot(rocOb, print.auc = TRUE,
col = "blue",
lwd = 3,
legacy.axes = TRUE,
main = "ROC Curve for Survival Prediction")
abline(a = 0, b = 1, lty = 2, col = "red")

auc(rocOb)
Area under the curve: 0.8204
---
title: "Project: Mass Murder Statistics"
output: html_notebook
author: "Halan Badilla Osorio"
---

```{r}
Sys.setenv(LANG = "en")
```


```{r}
library(tidyverse)
library(sf)
library(ggplot2)
library(maps)
library(mapproj)
library(usmap)
library(ISLR2)
library(factoextra)
library(MASS)
library(dplyr)
library(hexbin)
library(lubridate)
library(corrplot)
library(caret)
library(randomForest)
library(car)
library(pROC)
```

```{r}
setwd("C:/Users/halan/Desktop/FPU/SPring2025/StatisticalLearning")

incidents <- read.csv("mass_killing_incidents_public.csv")
offenders <- read.csv("mass_killing_offenders_public.csv")
victims <- read.csv("mass_killing_victims_public.csv")
weapons <- read.csv("mass_killing_weapons_public.csv")
```

```{r}
view(incidents)
view(offenders)
view(victims)
view(weapons)
```

DATA: Mass Killing Database 2006 - Present (AS OF MARCH 15th 2025)

Mass Murder for this dataset is defined as "...the intentional killing of four or more victims by any means within a 24-hour period, excluding the death of unborn children and the offender(s).)

This Database is constantly being updated (Was just updated 2 hours ago as of March 16th, 2025 at 12:50PM) and I would say is very trustworthy. The contact details for this Database is Justin Myers. Justin Myers is a veteran journalist for "The Associated Press", they are also working with Northeastern University's James Alan Fox whom is also a contact.



Features(Incident [19 Columns x 615 Rows]):

<span style="color:red">Red</span> = Categorical | <span style="color:green">Green</span> = Numerical

- <span style="color:red">incident_id</span>: ID of incident
- <span style="color:green">date</span>: Date of incident
- <span style="color:red">city</span>: City/Town name
- <span style="color:red">state</span>: State postal code
- <span style="color:green">num_offenders</span>: Number of offenders
- <span style="color:green">num_killed</span>: Number of victims killed
- <span style="color:green">num_injured</span>: Numnber of victims injured
- <span style="color:red">firstcod</span>: First cause of death
- <span style="color:red">secondcod</span>: Second cause of death
- <span style="color:red">if_assault_rifle</span>: Whether a rifle was used
- <span style="color:red">type</span>: Type of incident (ie. Family, Felony, Public, etc.)
- <span style="color:red">situation_type</span>: Type of situation (ie. Arson, Drug Trade, Family Issue, etc.)
- <span style="color:red">location_type</span>: Type of location (ie. Commercial/Retail/Entertainment, Government/Transit, House of worship, etc.)
- <span style="color:red">location</span>: Location (ie. Bar/Club/Restaurant, College, Commercial/Retail, etc.)
- <span style="color:green">longitude</span>
- <span style="color:green">latitude</span>
- <span style="color:red">county</span>: County associated with coordinates
- <span style="color:green">geocode</span>: FIPS geocode of location
- narrative: Description of incident


Features(Victims [6 Columns x 3174 Rows]):

- <span style="color:red">incident_id</span>: ID for incident, used to link the victims data.
- <span style="color:red">victim_id</span>: ID of the victim, for recording purposes.
- <span style="color:green">age</span>: Age of the victim.
- <span style="color:red">race</span>
- <span style="color:red">sex</span>
- <span style="color:red">vorelationship</span>: Relationship of victim to the offender

Features(Weapons [5 Columns x 963 Rows]):

- <span style="color:red">incident_id</span>: ID for incident, used to link weapon to incident.
- <span style="color:red">weapon_id</span>: ID for weapon identification.
- <span style="color:red">weapon_type</span>: Type of weapon.
- <span style="color:red">gun_class</span>: Classification of gun (Handgun, Long gun, Unknown gun class, Non-gun)
- <span style="color:red">gun_type</span>: Type of gun (ie. Handgun, Pistol, Revolver, Rifle, etc.)

Features(Offenders [15 Columns x 788 Rows]):

- <span style="color:red">incident_id</span>: ID of incident, used to connect the 4 data sets.
- <span style="color:red">offender_id</span>: ID used for classification of offenders (Mostly for multi-offender incidents)
- <span style="color:red">firstname</span>
- <span style="color:red">middlename</span>
- <span style="color:red">lastname</span>
- <span style="color:red">suffix</span>
- <span style="color:green">age</span>
- <span style="color:red">race</span>
- <span style="color:red">sex</span>
- <span style="color:red">suicide</span>: Whether the offender commited suicide or not.
- <span style="color:red">deathcause</span>: What did the offender die by whether it was suicide or killed by a bystander/police
- <span style="color:red">outcome</span>: What was the legal outcome for the incident (ie. Acquitted, Arrested/Pending trial, Charges dropped, etc.)
- <span style="color:red">criminal_justice_process</span>: Status of criminal justice process (ie. Arrested/Pending trial, charges dropped, etc.)
- <span style="color:red">sentence_type</span>: Type of sentence convicted (ie. Acquitted, Awaiting sentencing, Committed, Death sentence etc.)
- sentence_details: Details about the sentence convicted.

```{r}
glimpse(incidents)
```


Filling in blank information for clarity.

```{r}
unique(incidents$situation_type)

incidents$situation_type[incidents$situation_type == ""] <- "Unknown"

unique(incidents$situation_type)
```
Here is a quick diagram of the US (Excluding Alaska and Hawaii as it's not properly fitted) that showcases where all the incidents took place. The plan is for me to eventually also turn this into a heat map to give a better gauge as when there are multiple like this it is difficult to showcase how much there is due to overlap. I am also attempting two different ways of showing the data points. Eventually I'll figure out which would be the most effective method.
```{r}
varI <- incidents %>%
  select(c(incident_id, longitude, latitude, firstcod, situation_type, location, num_victims_killed, num_victims_injured)) %>%
  rename(lon = longitude) %>%
  rename(lat = latitude)

varI

transformedData <- usmap_transform(varI)
```


```{r}
tempData <- transformedData %>%
  filter(row_number() != 262)
```

```{r}
plot_usmap() + geom_sf(
  data = tempData,
  aes(color = firstcod, size = num_victims_killed),
  alpha = .5) +
  theme(legend.position = "right", legend.key.size = unit(1, 'mm'), plot.title=element_text(hjust=0.5, face = "bold")) +
  labs(title = "PRIMARY reason for victim deaths", caption = "-The 2017 Las Vegas Shooter has been excluded for clarity. That event had 60 deaths and 867 injured.")

```

```{r}
plot_usmap() + geom_sf(
  data = tempData,
  aes(color = firstcod, size = num_victims_injured),
  alpha = .5) +
  theme(legend.position = "right", legend.key.size = unit(1, 'mm'), plot.title=element_text(hjust=0.5, face = "bold")) +
  labs(title = "PRIMARY reason for injured victims", caption = "-The 2017 Las Vegas Shooter has been excluded for clarity. That event had 60 deaths and 867 injured.")

```

```{r}
plot_usmap() + geom_sf(
  data = tempData,
  aes(color = location, size = num_victims_killed),
  alpha = .5) +
  theme(legend.position = "right", legend.key.size = unit(1, 'mm'), plot.title=element_text(hjust=0.5, face = "bold")) +
  labs(title = "Location and Amount of Victim Deaths", caption = "-The 2017 Las Vegas Shooter has been excluded for clarity. That event had 60 deaths and 867 injured.")
```
```{r}
plot_usmap() + geom_sf(
  data = tempData,
  aes(color = location, size = num_victims_injured),
  alpha = .5) +
  theme(legend.position = "right", legend.key.size = unit(1, 'mm'), plot.title=element_text(hjust=0.5, face = "bold")) +
  labs(title = "Location and Amount of Injured Victims", caption = "-The 2017 Las Vegas Shooter has been excluded for clarity. That event had 60 deaths and 867 injured.")
```

```{r}
plot_usmap() + geom_sf(
  data = tempData,
  aes(color = situation_type, size = num_victims_killed),
  alpha = 0.8) +
  theme(legend.position = "right", legend.key.size = unit(1, 'mm'), plot.title=element_text(hjust=0.5, face = "bold")) +
  labs(title = "Type of Incident and Amount of Victim Deaths", caption = "-The 2017 Las Vegas Shooter has been excluded for clarity. That event had 60 deaths and 867 injured.")

```
```{r}
plot_usmap() + geom_sf(
  data = tempData,
  aes(color = situation_type, size = num_victims_injured),
  alpha = 0.8) +
  theme(legend.position = "right", legend.key.size = unit(1, 'mm'), plot.title=element_text(hjust=0.5, face = "bold")) +
  labs(title = "Type of Incident and Amount of Injured Victims", caption = "-The 2017 Las Vegas Shooter has been excluded for clarity. That event had 60 deaths and 867 injured.")

```

```{r}
# Getting a count of the incidents per state
stateIncidents <- incidents %>%
  count(state)

# Setting a fill color variable dependent on the number of incidents
stateIncidents <- stateIncidents %>%
  mutate(fillColor = ifelse(n>=40, "40+", ifelse(n>=20, "20-39", "0-19")))
```

Here is a bar graph statistic showcasing all of the incidents from the data set and showing them from most to least in a bar graph form. From this we can see that California, Texas and Illinois are the three most dangerous (in the sense of the most mass murders occuring) locations in the US. Florida being an honorable mention for fourth place. From here on I could also then further test to check the amount of victims, whilst this graph shows the most mass killings in occurrence for the past ~20 years, that does not necessarily pertain to deadliness as there may be other states that have less but more severe mass murders.

```{r}
# Creating the graph and organizing by amount
ggplot(stateIncidents, aes(x = reorder(state, +n), y = n, fill = fillColor)) + 
  geom_bar(stat = 'identity', width = 0.8) + 
  scale_fill_manual(values = c("0-19" = "gold", "20-39" = "orange", "40+" = "darkred")) +
  theme(axis.text.y = element_text(size = 7, hjust = 1)) +
  labs(title = "Total Incidents Per State", x = "States", y = "Amount of Incidents") +
  coord_flip()
```
```{r}
# Getting a sum of the amount of both victims killed and injured per state
deadlynessPerState <- incidents %>%
  group_by(state) %>%
  summarize(totalDeaths = sum(num_victims_killed), totalInjured = sum(num_victims_injured))

# Now applying a manual fill variable to be able to apply to the graph later
deadlynessPerState <- deadlynessPerState %>%
  mutate(fillColorDeath = ifelse(totalDeaths>=200, "200+", ifelse(totalDeaths>=100, "100-199", "0-99")),
         fillColorInjury = ifelse(totalInjured>=200, "200+", ifelse(totalInjured>=100, "100-199", "0-99")))
```

```{r}
ggplot(deadlynessPerState, aes(x = reorder(state, +totalDeaths), y = totalDeaths, fill = fillColorDeath)) +
  geom_bar(stat = 'identity', width = 0.8) +
  scale_fill_manual(values = c("0-99" = "gold", "100-199" = "orange", "200+" = "darkred")) +
  theme(axis.text.y = element_text(size = 7, hjust = 1)) +
  labs(title = "Total Victim Deaths Per State", x = "States", y = "Amount of Deaths") +
  coord_flip()
```

```{r}
ggplot(deadlynessPerState, aes(x = reorder(state, +totalInjured), y = totalInjured, fill = fillColorInjury)) +
  geom_bar(stat = 'identity', width = 0.8) +
  scale_fill_manual(values = c("0-99" = "gold", "100-199" = "orange", "200+" = "darkred")) +
  theme(axis.text.y = element_text(size = 7, hjust = 1)) +
  labs(title = "Total Injured Victims Per State", x = "States", y = "Amount of Injured") +
  coord_flip()
```


```{r}
ggplot(weapons, aes(x = weapon_type)) +
  geom_bar(width = 0.8)
```

With this Violin plot (+ Boxplot) we can see the general distribution of the victims as well as the medians. Generally it seems that females and males are both on average not too dissimilar but this is without knowing the exact amount of numbers. Proportionally however, the age of which they died does not seem to matter despite the sex. The unknown seems to be cases which the bodies were recovered but maybe not able to be identified as these cases have little to no information to them

```{r}
ggplot(victims, aes(x = sex, y = age, fill = sex)) +
  geom_violin(trim = FALSE) +
  geom_boxplot(width = 0.4) +
  geom_jitter(shape = 16, position=position_jitter(0.1), alpha = 0.2)
```


I specifically remember this occurrence, I recall it being everywhere on the news and so I tested out the ID and seeing how well I could extract the data. 

```{r}

incidentsFL <- incidents %>%
  filter(state == "FL")

incidentsFL

weapons %>%
  filter(incident_id == 300)

victims %>%
  filter(incident_id == 300)

offenders %>%
  filter(incident_id == 300)
```


Here I was checking if there are any other discriminatory attacks against gay people searching through the narratives for the term "gay", surprisingly, out of all the ones in the US only one showed up, which is the same one from above that occurred in Orlando.

```{r}
# With grepl I can search for specific words in the narrative or any other of the strings of texts that give narratives.
incidents %>%
  filter(grepl("gay", narrative))

```

```{r}

```

I would consider this data set to be *relatively* clean. But, there are some things that could be altered for the sake of working with them. For example, there are a lot of NULL cells but due to the type of data I do not plan to remove these rows, I will however probably change them into "N/A" or something of the sort as they are important. There are some features, for example - situation_type, which has "Undetermines" and just flat out Null. Reading through the narratives, my personal belief is that these are most likely incidents in which there was simply no official statement of what type of situation the incident was. This would be a situation in which I would alter the data to officially state something like "Not Stated" 



Discussion:

My general impression on this data set is that while it is very interested, at least for me, it does pose the challenge of attempting to do any sort of predictions. A lot of this data is simply fact but because of the nature of the data I cannot seem to do any sort of time series data as the dates could be considered as continuous but the situations are separate and not linked together.

Regarding the type of models, I'm thinking of maybe using a type of linear regression model to see if maybe there is any correlation with certain features to determine/predetermine the potential lethality of a situation based on the information it has given us. This would be tricky as things like race and age are too broad and other features like situation_type are generally hard to gague before the situation has already escalated. That said I'll attempt to at least see what I would be able to predict like maybe what the odds are of survival with specific body types (age, sex, race, etc.) have in their respective situations. This could be done with the usage of linear regression. 

```{r}
fullMerged = 0
fullMerged <- full_join(incidents, offenders, by = "incident_id")
fullMerged <- full_join(fullMerged, victims, by = "incident_id")
fullMerged <- full_join(fullMerged, weapons, by = "incident_id")

# x = Offender, y = Victim
fullMergedClean <- subset(fullMerged, select = -c(incident_id, location_type, longitude, latitude, census_tract_geoid, narrative, firstname, lastname, middlename, suffix, deathcause, outcome, criminal_justice_process, sentence_type, sentence_details, victim_id, weapon_id, offender_id, type))
```

```{r}
fullMergedClean$had_survivor <- ifelse(fullMergedClean$num_victims_injured > 0, "Yes", "No")
fullMergedClean$had_survivor <- as.factor(fullMergedClean$had_survivor)
```

```{r}
unique(fullMergedClean)

fullMergedClean$date <- ymd(fullMergedClean$date)
fullMergedClean$year <- year(fullMergedClean$date)
fullMergedClean$month <- month(fullMergedClean$date)
fullMergedClean$day <- days(fullMergedClean$date)
fullMergedClean$weekday <- weekdays(fullMergedClean$date)

factorCol <- c("weekday", "city", "state", "firstcod", "secondcod", "if_assault_rifle_used", "situation_type", "location", "county", "race.x", "sex.x", "race.y", "sex.y", "vorelationship", "weapon_type", "weapon_type", "gun_class", "gun_type")

fullMergedClean <- fullMergedClean |>
  mutate(across(all_of(factorCol), as.factor))

fullMergedClean$suicide <- as.numeric(fullMergedClean$suicide)

testMerge <- subset(fullMergedClean, select = c(weekday, state, num_offenders, num_victims_killed, num_victims_injured, firstcod, secondcod, if_assault_rifle_used, situation_type, location, age.x, race.x, sex.x, age.y, race.y, sex.y, vorelationship, weapon_type, gun_class, gun_type, had_survivor))

testMerge
```


```{r}
ldaTestModel <- lda(had_survivor ~ ., data = testMerge)
summary(ldaTestModel)

cor(testMerge)
```

```{r}
incidents
offenders
victims
weapons
```

FEATURE ENGINEER, GOING TO MERGE EVERYTHING
Features wanted
incident_id, date, state, num_offenders, num_victims_killed, num_victims_injured, firstcod, if_assault_rifle_used, situation_type, location, offender_age, offender_race, offender_sex, victim_age, victim_sex, vorelationship, weapon_type

ADD
num_victims_total

Turn Categorical into dummy variables

```{r}
# Fixing the date and adding extra date columns.
incidentsUpdated <- incidents %>%
  mutate(
    date = as.Date(date),
    year = year(date),
    month = month(date),
    day = day(date),
    weekday = weekdays(date)
  )

incidentsUpdated <- na.omit(incidentsUpdated)

incidentsUpdated$had_survivor <- ifelse(incidentsUpdated$num_victims_injured > 0, "Yes", "No")
incidentsUpdated$had_survivor <- as.factor(incidentsUpdated$had_survivor)

# [Name] Summaries, these will get all the columns that will be used for the models.
incidentsSum <- incidentsUpdated %>%
  subset(select = c(incident_id, date, year, month, day, weekday, state, num_offenders, num_victims_killed, num_victims_injured, firstcod, situation_type, location, had_survivor))

offenderSum <- offenders %>%
  subset(select = c(incident_id, offender_id, age, race, sex)) %>%
  rename(offenderAge = age) %>%
  rename(offenderRace = race) %>%
  rename(offenderSex = sex)

victimSum <- victims %>%
  subset(select = c(incident_id, victim_id, age, race, sex, vorelationship)) %>%
  rename(victimAge = age) %>%
  rename(victimRace = race) %>%
  rename(victimSex = sex)

weaponSum <- weapons %>%
  subset(select = c(incident_id, weapon_id, weapon_type))

# Merging the data
totalData <- incidentsSum %>%
  left_join(offenderSum, by = "incident_id") %>%
  left_join(victimSum, by = "incident_id") %>%
  left_join(weaponSum, by = "incident_id")

totalData <- na.omit(totalData)
totalData
```

```{r}
# List of column names I need to factorize
charEncode <- c("weekday", "state", "firstcod", "situation_type", "location", "offenderRace", "offenderSex", "victimRace", "victimSex", "vorelationship", "weapon_type")
```

```{r}
# Turns any blank cells to "Unknown"
totalData[totalData == ""] <- "Unknown"

#any(is.na(totalData))

# This will factorize all of the data instead of having to manually do it
totalDataFactored <- totalData %>%
  mutate(across(all_of(charEncode), ~ as.integer(factor(.x))))
```

```{r}
# Sums the max value in all of the columns which should, in theory, give the total amount of distinct columns I should have.
sum(sapply(totalDataFactored[charEncode], max))

# Get all of the names and which column they were from
factorLookup <- totalData %>%
  dplyr::select(charEncode) %>%
  pivot_longer(cols = everything(),
               names_to = "Origin",
               values_to = "Name") 

# Gathers all of the factored data
factorLookup2 <- totalDataFactored %>%
  dplyr::select(charEncode) %>%
  pivot_longer(cols = everything(),
               names_to = NULL,
               values_to = "Encoded Value")

# Assuming nothing has changed and having checked the number it matches up. I assume this is correct and now factorLookup is a good dictionary to go back to.
factorLookup <- cbind(factorLookup, factorLookup2)
factorLookup <- distinct(factorLookup)
factorLookup
```

```{r}
splitData <- createDataPartition(binaryFactored$had_survivor, p = 0.8, list = FALSE)
trainData <- binaryFactored[splitData, ]
testData <- binaryFactored[-splitData, ]
```


```{r, fig.width = 8, fig.height = 8}
# This is so that I can use it in the corr matrix but also make my life easier
totalDataFactoredForReal <- subset(totalDataFactored, select = -c(incident_id, date, year, month, offender_id, victim_id))
totalDataFactoredForReal$had_survivor <- as.numeric(totalDataFactoredForReal$had_survivor)

# Linear Regression Model
linModel <- lm(had_survivor ~ ., data = totalDataFactoredForReal)
summary(logModel)

corM <- cor(totalDataFactoredForReal)
corrplot(corM, method = "color")
heatmap(corM, symm = TRUE, main = "Correlation Heatmap")
```

```{r}
# K-Fold Train Control
set.seed(789)
train_control <- trainControl(method = "repeatedcv", number = 10, repeats = 3, classProbs = TRUE, summaryFunction = twoClassSummary, verboseIter = FALSE)
```

```{r}
binaryFactored <- subset(totalDataFactored, select = -c(incident_id, date, year, month, offender_id, victim_id))
binaryFactored$had_survivor <- factor(totalDataFactored$had_survivor, levels = c("Yes", "No"))
binaryFactored

table(binaryFactored$had_survivor)

rfModel <- train(had_survivor ~ . -had_survivor -num_victims_killed -num_victims_injured, data = binaryFactored, method = "rf", trControl = train_control, ntree = 500)
print(rfModel)

```


```{r}
# Logistic Regression: Creating the initial model
logModel <- glm(had_survivor~ . -num_victims_killed -num_victims_injured -had_survivor, data = trainData, family = "binomial")

# Applying stepwise
stepLogModel <- step(logModel, direction = "both")

# Training the stepwise on kCV
stepLogModelCV <- train(had_survivor ~ . -num_victims_killed -num_victims_injured -had_survivor, data = trainData, method = "glm", family = "binomial", trControl = train_control, metric = "ROC")
```

```{r}
# Logistic Regression with Step Wise and kCV
stepLogPred <- predict(stepLogModelCV, newdata = testData, type = "prob")
rocOb <- roc(testData$had_survivor, stepLogPred$Yes)
plot(rocOb, print.auc = TRUE,
         col = "blue",
         lwd = 3,
         legacy.axes = TRUE,
         main = "ROC Curve for Survival Prediction")
abline(a = 0, b = 1, lty = 2, col = "red")
auc(rocOb)
```



